EPA HAP Data Explorer

Explore the data contained in the EPA Hazardous Air Pollutants dataset. This dataset contains a summary of daily Hazardous Air Pollutants from 1990 to 2017.

Download the data from Kaggle.


In [1]:
import pandas as pd
from pivottablejs import pivot_ui

In [2]:
# CHANGE THIS TO THE LOCATION OF THE FILE
csv_file = "/Users/robert.dempsey/Downloads/epa_hap_daily_summary.csv"

In [3]:
hap_df = pd.DataFrame.from_csv(csv_file)

In [4]:
# Get the row and column counts
rows_cols = hap_df.shape
print("Rows: {}".format(rows_cols[0]))
print("Columns: {}".format(rows_cols[1]))


Rows: 8097069
Columns: 28

In [5]:
# List the columns and their types
hap_df.dtypes


Out[5]:
county_code              int64
site_num                 int64
parameter_code           int64
poc                      int64
latitude               float64
longitude              float64
datum                   object
parameter_name          object
sample_duration         object
pollutant_standard     float64
date_local              object
units_of_measure        object
event_type              object
observation_count        int64
observation_percent    float64
arithmetic_mean        float64
first_max_value        float64
first_max_hour           int64
aqi                    float64
method_code              int64
method_name             object
local_site_name         object
address                 object
state_name              object
county_name             object
city_name               object
cbsa_name               object
date_of_last_change     object
dtype: object

In [6]:
# Show summary statistics
hap_df.describe()


Out[6]:
county_code site_num parameter_code poc latitude longitude pollutant_standard observation_count observation_percent arithmetic_mean first_max_value first_max_hour aqi method_code
count 8.097069e+06 8.097069e+06 8.097069e+06 8.097069e+06 8.097069e+06 8.097069e+06 0.0 8.097069e+06 8.097069e+06 8.097069e+06 8.097069e+06 8.097069e+06 0.0 8.097069e+06
mean 8.995137e+01 1.370898e+03 6.042453e+04 2.835583e+00 3.818801e+01 -9.434529e+01 NaN 1.572091e+00 9.880965e+01 4.466537e-01 5.618971e-01 7.211276e-01 NaN 4.101036e+02
std 1.008587e+02 2.746745e+03 2.560563e+04 2.905911e+00 5.968574e+00 1.716207e+01 NaN 3.434705e+00 8.590761e+00 1.247981e+01 1.306216e+01 3.240937e+00 NaN 3.298626e+02
min 1.000000e+00 1.000000e+00 1.210300e+04 1.000000e+00 0.000000e+00 -1.605083e+02 NaN 1.000000e+00 1.000000e+00 -1.000000e-01 -1.000000e-01 0.000000e+00 NaN 8.900000e+01
25% 2.700000e+01 8.000000e+00 4.380400e+04 1.000000e+00 3.394471e+01 -1.066766e+02 NaN 1.000000e+00 1.000000e+02 0.000000e+00 0.000000e+00 0.000000e+00 NaN 1.160000e+02
50% 6.300000e+01 4.200000e+01 4.386000e+04 1.000000e+00 3.892185e+01 -9.278027e+01 NaN 1.000000e+00 1.000000e+02 8.900000e-04 8.900000e-04 0.000000e+00 NaN 1.750000e+02
75% 1.190000e+02 1.007000e+03 8.811200e+04 5.000000e+00 4.219438e+01 -8.096230e+01 NaN 1.000000e+00 1.000000e+02 4.000000e-02 4.000000e-02 0.000000e+00 NaN 8.000000e+02
max 8.400000e+02 9.998000e+03 8.813600e+04 9.900000e+01 6.709195e+01 0.000000e+00 NaN 9.600000e+01 1.000000e+02 2.000000e+04 2.000000e+04 2.300000e+01 NaN 9.230000e+02

In [7]:
# View the last five rows of the dataframe
hap_df.tail()


Out[7]:
county_code site_num parameter_code poc latitude longitude datum parameter_name sample_duration pollutant_standard ... aqi method_code method_name local_site_name address state_name county_name city_name cbsa_name date_of_last_change
state_code
53 33 80 43804 6 47.568236 -122.308628 WGS84 Carbon tetrachloride 24 HOUR NaN ... NaN 101 CANISTER SUBAMBIENT PRESSURE - MULTI DETECTOR GC SEATTLE - BEACON HILL 4103 BEACON HILL S Washington King Seattle Seattle-Tacoma-Bellevue WA 2015-03-17
6 19 8 45201 5 36.781333 -119.773190 NAD83 Benzene 24 HOUR NaN ... NaN 171 6L Pressurized Canister - Precon Saturn GC/MS 3425 N FIRST ST FRESNO 3425 N FIRST ST FRESNO California Fresno Fresno Fresno CA 2013-06-11
13 21 12 43802 1 32.805408 -83.543521 WGS84 Dichloromethane 24 HOUR NaN ... NaN 150 SS 6L- PRESSURIZED CANISTER - CRYOGENIC PRECON... Macon SE Georgia Forestry Commission 5645 Riggins Mill ... Georgia Bibb Macon Macon GA 2013-06-11
53 47 12 88103 1 48.387531 -119.928671 WGS84 Arsenic PM2.5 LC 24 HOUR NaN ... NaN 800 IMPROVE Module A with Cyclone Inlet-Teflon Fil... NaN Pasayten Washington Okanogan Not in a city NaN 2015-07-28
33 15 14 88136 5 43.075333 -70.748000 WGS84 Nickel PM2.5 LC 24 HOUR NaN ... NaN 821 Andersen RAAS Teflon - Energy Dispersive XRF PORTSMOUTH - PEIRCE ISLAND PORTSMOUTH PEIRCE ISLAND New Hampshire Rockingham Portsmouth Boston-Cambridge-Newton MA-NH 2015-07-22

5 rows × 28 columns


In [8]:
# Print all columns and values for the last row in the dataset
hap_df.tail(1).to_dict(orient='records')


Out[8]:
[{'address': 'PORTSMOUTH PEIRCE ISLAND',
  'aqi': nan,
  'arithmetic_mean': 0.0,
  'cbsa_name': 'Boston-Cambridge-Newton MA-NH',
  'city_name': 'Portsmouth',
  'county_code': 15,
  'county_name': 'Rockingham',
  'date_local': '2004-03-13',
  'date_of_last_change': '2015-07-22',
  'datum': 'WGS84',
  'event_type': 'None',
  'first_max_hour': 0,
  'first_max_value': 0.0,
  'latitude': 43.075333,
  'local_site_name': 'PORTSMOUTH - PEIRCE ISLAND',
  'longitude': -70.748,
  'method_code': 821,
  'method_name': 'Andersen RAAS Teflon - Energy Dispersive XRF',
  'observation_count': 1,
  'observation_percent': 100.0,
  'parameter_code': 88136,
  'parameter_name': 'Nickel PM2.5 LC',
  'poc': 5,
  'pollutant_standard': nan,
  'sample_duration': '24 HOUR',
  'site_num': 14,
  'state_name': 'New Hampshire',
  'units_of_measure': 'Micrograms/cubic meter (LC)'}]

In [9]:
# Check to see if there are any null values in any of the columns
null_values_present = hap_df.isnull().values.any()

if null_values_present:
    nan_rows = hap_df[hap_df.isnull().T.any().T]
    print("Rows with empty values: {}".format(len(nan_rows)))
    print("Filling empty values")
    hap_df.fillna(0, inplace=True)
    print("Empty values filled")


Rows with empty values: 8097069
Filling empty values
Empty values filled

In [10]:
# Create a dataframe from the last 60 rows and use PivotTable.js to view the data
hap_pivot_df = hap_df.tail(60)

# Create a column from the timestamp index and create a new numerical index
hap_pivot_df.reset_index(level=0, inplace=True)

# Show the pivot table
pivot_ui(hap_pivot_df)


Out[10]:

In [ ]: